# setup: load packages

library(gKRLS)
library(tidyverse)
library(haven)
library(mgcv)
library(tidygam)
library(hardhat)
library(gridExtra)
library(patchwork)
library(scales)
library(modelsummary)
library(APCtools)

# read in cleaned data files
clean_23 <- read_csv("Clean data 2023.csv")
clean_22 <- read_csv("Clean data 2022.csv")
clean_21 <- read_csv("Clean data 2021.csv")
clean_20 <- read_csv("Clean data 2020.csv")
clean_19 <- read_csv("Clean data 2019.csv")
clean_18 <- read_csv("Clean data 2018.csv")
clean_17 <- read_csv("Clean data 2017.csv")
clean_16 <- read_csv("Clean data 2016.csv")


# rename sex variable in 2016-2018 to make data commensurable
clean_18$SEXVAR <- clean_18$SEX1
clean_18 <- clean_18 %>%
  select(-SEX1)

clean_17$SEXVAR <- clean_17$SEX
clean_17 <- clean_17 %>%
  select(-SEX)

clean_16$SEXVAR <- clean_16$SEX
clean_16 <- clean_16 %>%
  select(-c(SEX, race2))

# append dataframes
clean_data <- plyr::rbind.fill(clean_23, clean_22, clean_21, clean_20, clean_19, clean_18, clean_17, clean_16)

# delete individual dataframes to free memory
rm(clean_23, clean_22, clean_21, clean_20, clean_19, clean_18, clean_17, clean_16)

# data cleaning: create gender variables
clean_data$trans_fac <- factor(clean_data$trans,
                               levels = c(0,1),
                               labels = c("Non-Trans", "Trans"))

clean_data <- clean_data %>%
  mutate(sex_fac = case_when(transfem == 1 ~ 1,
                             transmasc == 1 ~ 2,
                             TRUE ~ SEXVAR))


# normalize sampling weights so that their mean is 1
clean_data$norm_weights <- clean_data$weights/mean(clean_data$weights)

# create approximate birth year variable and categorical year
clean_data <- clean_data %>%
  mutate(birth_year = year - age,
         year_fac = factor(year, levels = c(2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016)))


# create disability variables
clean_data <- clean_data %>%
  mutate(phys_dis = case_when(blind + deaf + walk + dress > 0 ~ 1,
                              blind + deaf + walk + dress == 0 ~ 0),
         phys_dis_fac = factor(phys_dis,
                               levels = c(0,1),
                               labels = c("No", "Yes")))
         
# create a categorical variable for birth year and remove missing data for disability
clean_data <- clean_data %>%
  filter(!is.na(phys_dis)) %>%
  mutate(birth_year_binned = case_when(birth_year < 1946 ~ "1945",
                                       birth_year < 1956 ~ "1955",
                                       birth_year < 1966 ~ "1965",
                                       birth_year < 1976 ~ "1975",
                                       birth_year < 1986 ~ "1985",
                                       birth_year < 1996 ~ "1995",
                                       birth_year < 2006 ~ "2005"))

# table a1

wtpct <- function(x, y) sum(x, na.rm = TRUE) / sum(y, na.rm = T) * 100

datasummary((`Birth cohort` = birth_year_binned) + (`Race` = race) + (`Ethnicity` = ethnicity) + (`Education` = edu) + (`Physical Disability` = phys_dis_fac) + 1 ~ (`Gender Identity` = gender_id)*(N + norm_weights*Percent(fn = wtpct, denom = "col")),
            data = clean_data,
            output = "brfss_summary.html")

# clean diabetes variable
clean_data <- clean_data %>%
  mutate(diabetes_class = factor(case_when(diabetes == 0 ~ "None",
                                           diabetes == 1 & (diabetes_type == 1 | diabetes_early == 1) ~ "Likely Type 1",
                                           diabetes == 1 ~ "Likely Type 2")))

## Figure 1 model

dis_fit <- gam(trans ~ year_fac + s(birth_year, by = phys_dis_fac,
                                           bs = "gKRLS"),
               data = clean_data,
               family = binomial(link = "logit"),
               weights = norm_weights,
               method = "REML")

# get variance-covariance matrix for cluster-robust standard errors

robust <- vcovCL(dis_fit, cluster = clean_data$id)

## list years to compute predictions for
year_5_dbl <- c(1935, 1935, 1940, 1940, 1945, 1945, 1950, 1950, 1955, 1955, 1960, 1960,
                1965, 1965, 1970, 1970, 1975, 1975, 1980, 1980, 1985, 1985, 1990, 1990,
                1995, 1995, 2000, 2000, 2002, 2002, 2004, 2004, 2005, 2005)


## get predictions based on model
dis_effects <- calculate_effects(dis_fit,
                  variables = c("birth_year"),
                  conditional = data.frame("birth_year" = year_5_dbl,
                                           "year_fac" = "2023"),
                  vcov = robust,
                  continuous_type = "predict",
                  verbose = T)

write_csv(dis_effects, "dis_effects.csv")

## Figure 2: Diabetes analysis

diabetes_fit <- gam(trans ~ year_fac + s(birth_year, by = diabetes_class,
                                         bs = "gKRLS"),
                    data = clean_data,
                    family = binomial(link = "logit"),
                    weights = norm_weights,
                    method = "REML")

# years to get predictions for
year_5_three <- c(1935, 1935, 1935, 1940, 1940, 1940, 1945, 1945, 1945, 1950, 1950, 1950,
                1955, 1955, 1955, 1960, 1960, 1960, 1965, 1965, 1965, 1970, 1970, 1970,
                1975, 1975, 1975, 1980, 1980, 1980, 1985, 1985, 1985, 1990, 1990, 1990,
                1995, 1995, 1995, 2000, 2000, 2000, 2002, 2002, 2002, 2004, 2004, 2004)

# get var-covar matrix
robust_dia <- vcovCL(diabetes_fit, cluster = clean_data$id)

# get predictions
diabetes_effects <- calculate_effects(diabetes_fit,
                                      variables = c("birth_year"),
                                      conditional = data.frame("birth_year" = year_5_three,
                                                               "diabetes_class" = c("Likely Type 1",
                                                                                    "Likely Type 2",
                                                                                    "None"),
                                                               "year_fac" = "2023"),
                                      vcov = robust_dia,
                                      continuous_type = "predict",
                                      verbose = T)

write_csv(diabetes_effects, "diabetes_effects.csv")

# figure 4

## fit model for people assigned female at birth
dis_fit_afab <- gam(trans ~ year_fac + s(birth_year, by = phys_dis_fac,
                                           bs = "gKRLS"),
               data = clean_data[which(clean_data$sex_fac == 2),],
               family = binomial(link = "logit"),
               weights = norm_weights,
               method = "REML")

## fit model for people assigned male at birth
dis_fit_amab <- gam(trans ~ year_fac + s(birth_year, by = phys_dis_fac,
                                                bs = "gKRLS"),
                    data = clean_data[which(clean_data$sex_fac == 1),],
                    family = binomial(link = "logit"),
                    weights = norm_weights,
                    method = "REML")

## get variance-covariance matrices for afab and amab respondents
robust_afab <- vcovCL(dis_fit_afab, cluster = clean_data[which(clean_data$sex_fac == 2),]$id)

robust_amab <- vcovCL(dis_fit_amab, cluster = clean_data[which(clean_data$sex_fac == 1),]$id)

## compute predicted values for afab and amab models
afab_effects <- calculate_effects(dis_fit_afab,
                                 variables = c("birth_year"),
                                 conditional = data.frame("birth_year" = year_5_dbl,
                                                          "phys_dis_fac" = c("No", "Yes"),
                                                          "year_fac" = "2023"),
                                 vcov = robust_afab,
                                 continuous_type = "predict",
                                 verbose = T)

amab_effects <- calculate_effects(dis_fit_amab,
                                  variables = c("birth_year"),
                                  conditional = data.frame("birth_year" = year_5_dbl,
                                                           "phys_dis_fac" = c("No", "Yes"),
                                                           "year_fac" = "2023"),
                                  vcov = robust_amab,
                                  continuous_type = "predict",
                                  verbose = T)

write_csv(afab_effects, "afab_effects.csv")
write_csv(amab_effects, "amab_effects.csv")


# rename variables for age period cohort analysis
clean_data$period <- clean_data$year
clean_data$cohort <- clean_data$birth_year

# create separate data frames for people with/without disabilities

dis_df <- clean_data %>% # disabled
  filter(phys_dis == 1)

ndis_df <- clean_data %>% # non-disabled
  filter(phys_dis == 0)

# fit model only on disabled df
model_dis <- gam(trans ~ te(age, cohort, bs = "cs"),
                family = binomial(link = "logit"),
                 weights = norm_weights,
                 data = dis_df)

# fit model only on non-disabled df
model_ndis <- gam(trans ~ te(age, cohort),
                 family = binomial(link = "logit"),
                 weights = norm_weights,
                 data = ndis_df)

## Run APC heatmap_diff.R to get this function;


plot_APCheatmap_diff(model1 = model_dis, # Figure 3
                     model2 = model_ndis,
                     dat = clean_data,
                     bin_heatmap = F,
                     markLines_list = list("cohort" = c(1940, 1950, 1960,
                                                        1970, 1980, 1990, 2000)),
                     plot_CI = F,
                     legend_limits = c(0.5, 5))

# appendix C
clean_data <- clean_data %>%
  mutate(blind_fac = factor(blind, levels = c(0,1), labels = c("No", "Yes")),
         deaf_fac = factor(deaf, levels = c(0,1), labels = c("No", "Yes")),
         walk_fac = factor(walk, levels = c(0,1), labels = c("No", "Yes")),
         dress_fac = factor(dress, levels = c(0,1), labels = c("No", "Yes")))


dis_fit_blind <- gam(trans ~ year_fac + s(birth_year, by = blind_fac,
                                          bs = "gKRLS"),
                     data = clean_data,
                     family = binomial(link = "logit"),
                     weights = norm_weights,
                     method = "REML")

dis_fit_deaf <- gam(trans ~ year_fac + s(birth_year, by = deaf_fac,
                                         bs = "gKRLS"),
                    data = clean_data,
                    family = binomial(link = "logit"),
                    weights = norm_weights,
                    method = "REML")

dis_fit_walk <- gam(trans ~ year_fac + s(birth_year, by = walk_fac,
                                         bs = "gKRLS"),
                    data = clean_data,
                    family = binomial(link = "logit"),
                    weights = norm_weights,
                    method = "REML")

dis_fit_dress <- gam(trans ~ year_fac + s(birth_year, by = dress_fac,
                                          bs = "gKRLS"),
                     data = clean_data,
                     family = binomial(link = "logit"),
                     weights = norm_weights,
                     method = "REML")

## get variance-covariance matrix for each model
robust_blind <- vcovCL(dis_fit_blind, clean_data$id)

robust_deaf <- vcovCL(dis_fit_deaf, clean_data$id)

robust_walk <- vcovCL(dis_fit_walk, clean_data$id)

robust_dress <- vcovCL(dis_fit_dress, clean_data$id)

## get predicted values for each model
blind_effects <- calculate_effects(dis_fit_blind,
                                   variables = c("birth_year"),
                                   conditional = data.frame("birth_year" = year_5_dbl,
                                                            "blind_fac" = c("No", "Yes"),
                                                            "year_fac" = "2023"),
                                   vcov = robust_blind,
                                   continuous_type = "predict",
                                   verbose = T)

write_csv(blind_effects, "brfss_blind.csv")

deaf_effects <- calculate_effects(dis_fit_deaf,
                                  variables = c("birth_year"),
                                  conditional = data.frame("birth_year" = year_5_dbl,
                                                           "deaf_fac" = c("No", "Yes"),
                                                           "year_fac" = "2023"),
                                  vcov = robust_deaf,
                                  continuous_type = "predict",
                                  verbose = T)

write_csv(deaf_effects, "brfss_deaf.csv")

walk_effects <- calculate_effects(dis_fit_walk,
                                  variables = c("birth_year"),
                                  conditional = data.frame("birth_year" = year_5_dbl,
                                                           "walk_fac" = c("No", "Yes"),
                                                           "year_fac" = "2023"),
                                  vcov = robust_walk,
                                  continuous_type = "predict",
                                  verbose = T)

write_csv(walk_effects, "brfss_walk.csv")

dress_effects <- calculate_effects(dis_fit_dress,
                                   variables = c("birth_year"),
                                   conditional = data.frame("birth_year" = year_5_dbl,
                                                            "dress_fac" = c("No", "Yes"),
                                                            "year_fac" = "2023"),
                                   vcov = robust_dress,
                                   continuous_type = "predict",
                                   verbose = T)

write_csv(dress_effects, "brfss_dress.csv")

